# Question 1 ----
# Part A ----
library(ggplot2)
library(tidyverse)
rm(list = ls())  # clear workspace (script-style; avoid in reusable code)
# Evaluation grid: 100 equally spaced points on [0, 1]
x <- seq(0, 1, length.out = 100)
# x <- rnorm(100, 0, 1)
# 8 equally spaced knots on [0, 1]; the first and last are dropped later,
# leaving the 6 interior knots used for the basis
knots <- seq(0, 1, length.out = 8)
# Basis functions per Woods formulation
# Constant (intercept) basis function: a vector of ones the same length as x
b1 <- function(x) {
  rep_len(1, length(x))
}
# Linear basis function: the identity map on x
b2 <- function(x) {
  x
}
# Cubic-spline reproducing kernel R(x, z) on [0, 1] (Wood's formulation).
# Vectorized elementwise in both arguments; symmetric in (x, z).
R_x_z <- function(x, z) {
  poly_part <- ((z - 0.5)^2 - 1 / 12) * ((x - 0.5)^2 - 1 / 12) / 4
  d <- abs(x - z) - 0.5
  quartic_part <- (d^4 - 0.5 * d^2 + 7 / 240) / 24
  poly_part - quartic_part
}
# Construct the basis design matrix: intercept, linear term, and one R(x, k)
# column per interior knot (boundary knots dropped) — 8 columns in total.
basis_matrix <- cbind(
  b1(x),
  b2(x),
  sapply(knots[-c(1, length(knots))], function(k) R_x_z(x, k))
)
basis_design_matrix <- basis_matrix %>% as_tibble()
# BUG FIX: the original built c("b1".."b4", "b5".."b12") = 12 names for an
# 8-column tibble, which errors at assignment time. Name one "b<i>" per column.
colnames(basis_design_matrix) <- paste0("b", seq_len(ncol(basis_design_matrix)))
basis_design_matrix <- basis_design_matrix %>%
  mutate(x = x)
# Long format: one row per (x, basis function) pair for ggplot
basis_melted <- reshape2::melt(basis_design_matrix, id.vars = "x")
ggplot(basis_melted, aes(x = x, y = value, color = variable)) +
  geom_line(size = 1) +
  labs(title = "Wood Cubic Spline Basis",
       x = "x", y = "Basis Function Value", color = "Basis") +
  theme_minimal()

# Part B ----
set.seed(123)
n <- 100
x <- rnorm(n, 0, 1)
# Noisy sinusoid; noise standard deviation is 0.5^2 = 0.25
y <- 5 + sin(3 * pi * (x - 0.6)) + rnorm(n, sd = 0.5^2)
# 32 equally spaced knots over the observed range of x; the two boundary
# knots are removed, leaving exactly 30 interior knots
knots <- seq(min(x), max(x), length.out = 32)
knot_positions <- knots[-c(1, length(knots))]
n_knots <- length(knot_positions)
# Penalty matrix over all knot pairs: S[i, j] = R(k_i, k_j)
# NOTE(review): R_x_z is derived for inputs in [0, 1]; x here is N(0, 1), so
# knots fall outside that range — confirm this is intended.
S <- outer(knot_positions, knot_positions, R_x_z)
# Fit a penalized regression spline for a given smoothing parameter lambda.
# NOTE(review): reads the covariate `x` (and b1/b2/R_x_z) from the global
# environment rather than taking it as an argument.
# Only the kernel columns (3:end) enter the fit; the intercept and linear
# columns built here are discarded — confirm that is intended.
# Returns a list with fitted_values and se_errors (sqrt of hat-matrix diag).
penalized_regression_spline <- function(lambda, S, knot_positions, y) {
  full_basis <- cbind(
    b1(x),
    b2(x),
    sapply(knot_positions, function(k) R_x_z(x, k))
  )
  penalty <- lambda * S
  # Keep only the per-knot kernel columns
  design <- full_basis[, 3:ncol(full_basis)]
  ridge_lhs <- t(design) %*% design + penalty
  ridge_rhs <- t(design) %*% y
  coefs <- solve(ridge_lhs) %*% ridge_rhs
  preds <- design %*% coefs
  # Hat (influence) matrix; its diagonal is used as a rough variance proxy
  hat_mat <- design %*% solve(ridge_lhs) %*% t(design)
  return(list(fitted_values = preds, se_errors = sqrt(diag(hat_mat))))
}
# Fit with an essentially unpenalized lambda and plot fit over the data
fit_result <- penalized_regression_spline(1e-8, S, knot_positions, y)
plot_data <- data.frame(
  x = x,
  y = y,
  y_hat = fit_result$fitted_values,
  se_errors = fit_result$se_errors
)
ggplot(plot_data, aes(x = x, y = y)) +
  geom_point(color = "black", alpha = 0.6) +
  geom_line(aes(y = y_hat), color = "blue", size = 1.2) +
  labs(title = "Penalized Regression Spline Fit",
       x = "X",
       y = "Y") +
  theme_minimal()

# Part C ----
# Approximate pointwise 95% confidence bands: fit +/- 1.96 * se
half_width <- 1.96 * fit_result$se_errors
upper <- fit_result$fitted_values + half_width
lower <- fit_result$fitted_values - half_width
plot_data <- data.frame(x = x, y = y, y_hat = fit_result$fitted_values, lower = lower, upper = upper)
ggplot(plot_data, aes(x = x, y = y)) +
  geom_point(color = "black", alpha = 0.6) +
  geom_line(aes(y = y_hat), color = "blue", size = 1.2) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.1, fill = "blue") +
  labs(title = "Penalized Regression Spline Fit with Confidence Bands",
       x = "X",
       y = "Y") +
  theme_minimal()

# Part D ----
# Generalized cross-validation score for a given lambda.
# NOTE(review): reads x, knot_positions, S, y, b1, b2, R_x_z from the global
# environment; mirrors penalized_regression_spline's fit.
compute_gcv <- function(lambda) {
  full_basis <- cbind(
    b1(x),
    b2(x),
    sapply(knot_positions, function(k) R_x_z(x, k))
  )
  design <- full_basis[, 3:ncol(full_basis)]
  ridge_lhs <- t(design) %*% design + lambda * S
  coefs <- solve(ridge_lhs) %*% (t(design) %*% y)
  resid_vec <- y - design %*% coefs
  # Effective degrees of freedom = trace of the hat matrix
  hat_mat <- design %*% solve(ridge_lhs) %*% t(design)
  edf <- sum(diag(hat_mat))
  n_obs <- length(y)
  # GCV(lambda) = RSS / (n * (1 - edf/n)^2)
  sum(resid_vec^2) / (n_obs * (1 - edf / n_obs)^2)
}
# Profile GCV over a fine grid of smoothing parameters and plot it
lambda_values <- seq(1e-8, 1, length.out = 5000)
gcv_scores <- vapply(lambda_values, compute_gcv, numeric(1))
gcv_data <- data.frame(lambda = lambda_values, gcv = gcv_scores)
ggplot(gcv_data, aes(x = lambda, y = gcv)) +
  geom_line(color = "blue", size = 1.2) +
  labs(title = "GCV as a Function of the Smoothing Parameter",
       x = "Smoothing Parameter (lambda)",
       y = "GCV Score") +
  theme_minimal()

# Question 2 ----
# Tensor Product Spline ----
library(gamair)
library(tidyverse)
library(dplyr)  # already attached via tidyverse; kept for explicitness
rm(list = ls())  # clear workspace (script-style)
# Basis functions per Wood's formulation (redefined after the rm() above)
# Constant (intercept) basis: a vector of ones matching the length of x
b1 <- function(x) {
  rep_len(1, length(x))
}
# Linear basis: the identity map on x
b2 <- function(x) {
  x
}
# Cubic-spline reproducing kernel R(x, z) on [0, 1]; vectorized and symmetric
R_x_z <- function(x, z) {
  smooth_term <- ((z - 0.5)^2 - 1 / 12) * ((x - 0.5)^2 - 1 / 12) / 4
  gap <- abs(x - z) - 0.5
  smooth_term - (gap^4 - 0.5 * gap^2 + 7 / 240) / 24
}
data("brain")
brain_data = brain%>%as_tibble()%>%
select(X,Y, medFPQ)
# Fit a tensor-product spline surface to `response` over coordinates (x, y),
# with a fixed smoothing parameter `lambda`.
# Returns a list with fitted_values and se_errors (sqrt of hat-matrix diag).
# NOTE(review): the `penalize` argument is accepted but never used — confirm.
# NOTE(review): x and y are standardized with scale(), but the knot positions
# passed in are constructed on the ORIGINAL coordinate scale (see the
# x_knots/y_knots definitions below), so the kernel is evaluated with
# mismatched scales — confirm this is intended.
tensor_product_spline = function(x,y, response,x_knot_positions, y_knot_positions, x_penalty_matrix, y_penalty_matrix, lambda, penalize=FALSE) {
# Scale x and y
x_scaled = scale(x)
y_scaled = scale(y)
y_data = as.matrix(response)
# One kernel column per knot, evaluated at the scaled coordinates
x_basis_matrix <- cbind(sapply(x_knot_positions, function(k) R_x_z(x_scaled, k)))
y_basis_matrix <- cbind(sapply(y_knot_positions, function(k) R_x_z(y_scaled, k)))
tensor_basis_matrix = compute_tensor_product(x_basis_matrix, y_basis_matrix)
tensor_penalty_matrix = compute_tensor_product(x_penalty_matrix, y_penalty_matrix)
P = lambda * tensor_penalty_matrix
# NOTE(review): compute_tensor_product allocates nrow(A)*nrow(B) rows and
# fills them by recycling columns of length nrow(A); only the first
# nrow(y_data) rows are meaningful, and those are what is kept here.
X = tensor_basis_matrix[1:nrow(y_data),]
XtX_plus_p = t(X)%*%X + P
XtY = t(X)%*%y_data
# Generalized inverse guards against a singular penalized normal matrix
beta_hat = MASS::ginv(XtX_plus_p)%*%XtY
fitted_values = X %*% beta_hat
H = X %*% MASS::ginv(XtX_plus_p) %*% t(X)
se_errors = sqrt(diag(H))
return (list(fitted_values = fitted_values , se_errors = se_errors))
}
# Column-wise "tensor product": one output column per pair (column of A,
# column of B), i.e. m1 * m2 columns in (j, k) lexicographic order.
# NOTE(review): G is allocated with nrow(A) * nrow(B) rows, but each assigned
# column A[, j] * B[, k] has only nrow(A) elements (when nrow(A) == nrow(B)),
# so R silently RECYCLES the product to fill the column. Only the first
# nrow(A) rows form a genuine row-wise tensor-product basis; the caller
# relies on this by subsetting the first n rows. For the penalty matrices a
# true Kronecker product (base kronecker(A, B)) is the usual construction —
# confirm this recycling behavior is intended before reusing elsewhere.
compute_tensor_product = function(A, B){
m1 = ncol(A)
m2 = ncol(B)
G = matrix(NA, nrow=nrow(A)*nrow(B), ncol = m1 * m2 )
ccol <- 1
for (j in 1:m1) {
for (k in 1:m2) {
G[, ccol] <- A[, j] * B[, k]
ccol <- ccol + 1
}
}
return (G)
}
# 10 equally spaced knots over each coordinate's observed range.
# Consistency fix: use brain_data for both min and max (the original mixed
# brain_data$X with brain$X — the values are identical, but mixing the two
# objects is error-prone).
x_knots <- seq(min(brain_data$X), max(brain_data$X), length.out = 10)
y_knots <- seq(min(brain_data$Y), max(brain_data$Y), length.out = 10)
# Construct penalty matrix
# Marginal penalty matrix over all knot pairs: S[i, j] = R(k_i, k_j).
# R_x_z is vectorized, so outer() evaluates the full grid in one call.
construct_penalty_matrix <- function(knots) {
  outer(knots, knots, R_x_z)
}
# Marginal penalties for each coordinate
S_x <- construct_penalty_matrix(x_knots)
S_y <- construct_penalty_matrix(y_knots)
# Fit the tensor-product spline with a small fixed smoothing parameter
tensor_product_spline_result <- tensor_product_spline(
  brain_data$X,
  brain_data$Y,
  brain_data$medFPQ,
  x_knots,
  y_knots,
  S_x,
  S_y,
  1e-5
)
# Scatter the raw medFPQ values over the voxel coordinates
ggplot(brain_data, aes(x = X, y = Y)) +
  geom_point(aes(color = medFPQ), size = 2, alpha = 0.6) +
  scale_color_viridis_c() +
  labs(title = "Original Data",
       x = "X",
       y = "Y",
       color = "Observed medFPQ") +
  theme_minimal()

# Attach the fitted surface and show it on the voxel grid
brain_data <- brain_data %>% mutate(Fitted = tensor_product_spline_result$fitted_values)
ggplot(brain_data, aes(x = X, y = Y)) +
  geom_tile(aes(fill = Fitted)) +
  scale_fill_viridis_c() +
  labs(title = "Tensor Product Spline Fit",
       x = "X",
       y = "Y",
       fill = "Fitted medFPQ") +
  theme_minimal()

# Scatter the observed values again for visual comparison with the fit
ggplot(brain_data, aes(x = X, y = Y)) +
  geom_point(aes(color = medFPQ), size = 2, alpha = 0.6) +
  scale_color_viridis_c() +
  labs(title = "Observed medFPQ",
       x = "X",
       y = "Y",
       color = "Observed medFPQ") +
  theme_minimal()

# Side-by-side faceted comparison of observed vs fitted values
plot_data <- data.frame(
  X = brain_data$X,
  Y = brain_data$Y,
  Original = brain_data$medFPQ,
  Fitted = tensor_product_spline_result$fitted_values
)
plot_data_long <- tidyr::pivot_longer(
  plot_data,
  cols = c(Original, Fitted),
  names_to = "Type",
  values_to = "Value"
)
# Show the "Original" facet on the left
plot_data_long$Type <- factor(plot_data_long$Type, levels = c("Original", "Fitted"))
ggplot(plot_data_long, aes(x = X, y = Y, fill = Value)) +
  geom_tile() +
  facet_wrap(~ Type) +
  scale_fill_viridis_c() +
  labs(title = "Tensor Product Original vs Fitted Values",
       x = "X coordinate",
       y = "Y coordinate",
       fill = "medFPQ") +
  theme_minimal() +
  theme(aspect.ratio = 1)

# Thin Plate Spline ----
library(gamair)
library(tidyverse)
library(dplyr)  # already attached via tidyverse; kept for explicitness
rm(list = ls())  # clear workspace (script-style)
data("brain")
# Voxel coordinates plus the median FPQ response
brain_data <- brain %>%
  as_tibble() %>%
  select(X, Y, medFPQ)
# Radial Basis function
# Thin-plate radial basis eta(r) = r^2 * log(r), with the limit eta(0) = 0.
# ifelse keeps the shape of r and prevents log(0) = -Inf leaking through.
tps_basis <- function(r) {
  ifelse(r > 0, r^2 * log(r), 0)
}
# Build the thin-plate-spline design matrix for `data` against `knots`
# (first two columns of each = x/y coordinates).
# Returns:
#   X — full design [1, x, y | eta(||data_i - knot_j||)]
#   A — polynomial (null-space) part; B — radial basis part
create_design_matrix <- function(data, knots) {
  data_mat <- as.matrix(data[, 1:2])
  knots_mat <- as.matrix(knots[, 1:2])
  # Pairwise Euclidean distances between observations and knots
  distances <- fields::rdist(data_mat, knots_mat)
  # tps_basis is vectorized and ifelse preserves matrix dimensions, so apply
  # it to the whole distance matrix at once (the original looped elementwise
  # via apply(distances, c(1, 2), ...), which is much slower).
  B <- tps_basis(distances)
  A <- cbind(1, data)
  X <- cbind(A, B)
  list(X = X, A = A, B = B)
}
# Fit a thin-plate spline to `data` (columns: x, y, response) using
# `num_knots` knots taken as evenly spaced rows of the data, with smoothing
# parameter `lambda`. Returns fitted values and hat-matrix-based se proxies.
thin_plate_spline <- function(data, lambda, num_knots) {
  n <- nrow(data)
  # ROBUSTNESS FIX: seq(..., length.out) generally yields fractional indices
  # (it happens to be integral for this data set); non-integer row indices
  # are truncated by base subsetting and rejected by tibbles. round() makes
  # the knot selection explicit and safe for any n / num_knots combination.
  knot_indices <- round(seq(1, n, length.out = num_knots))
  knots <- data[knot_indices, 1:2]
  # Design matrix: 3 polynomial columns (intercept, x, y) + radial columns
  design_matrix <- create_design_matrix(data[, 1:2], knots)
  X <- as.matrix(design_matrix$X)
  B <- design_matrix$B
  # Penalty applies only to the radial coefficients; the 3 polynomial
  # coefficients stay unpenalized (zero blocks)
  m <- ncol(B)
  P <- rbind(
    cbind(matrix(0, 3, 3), matrix(0, 3, m)),
    cbind(matrix(0, m, 3), B[knot_indices, ])
  )
  y <- as.matrix(data[, 3])
  XtX_plus_p <- t(X) %*% X + (lambda * P)
  XtY <- t(X) %*% y
  beta_hat <- solve(XtX_plus_p) %*% XtY
  fitted_values <- X %*% beta_hat
  colnames(fitted_values) <- "Fitted"
  # Hat matrix; its diagonal serves as a rough pointwise variance proxy
  H <- X %*% solve(XtX_plus_p) %*% t(X)
  se_errors <- sqrt(diag(H))
  list(fitted_values = fitted_values, se_errors = se_errors)
}
# Fit with lambda = 0.5 and 10 knots spread evenly through the data rows
tps_result = thin_plate_spline(brain_data, 0.5, 10)
# Peek at the first fitted values (rendered output retained below)
head(tps_result$fitted_values)
## Fitted
## [1,] 1.4844050
## [2,] 1.2056177
## [3,] 1.3681456
## [4,] 1.4847700
## [5,] 1.5476221
## [6,] 0.9391614
# Tile plot of the fitted thin-plate spline surface (no raw data overlaid)
brain_data <- brain_data %>% mutate(Fitted = tps_result$fitted_values)
ggplot(brain_data, aes(x = X, y = Y)) +
  geom_tile(aes(fill = Fitted)) +
  scale_fill_viridis_c() +
  labs(title = "Thin-Plate Spline Fit",
       x = "X",
       y = "Y",
       fill = "Fitted medFPQ") +
  theme_minimal()

# Long-format frame with one row per (location, Type) for faceting
plot_data <- data.frame(
  X = brain_data$X,
  Y = brain_data$Y,
  Original = brain_data$medFPQ,
  Fitted = tps_result$fitted_values
)
plot_data_long <- tidyr::pivot_longer(plot_data, cols = c(Original, Fitted), names_to = "Type", values_to = "medFPQ")
# Order the facets so "Original" appears first
plot_data_long$Type <- factor(plot_data_long$Type, levels = c("Original", "Fitted"))
p <- ggplot(plot_data_long, aes(x = X, y = Y, fill = medFPQ)) +
  geom_tile() +
  facet_wrap(~ Type) +
  scale_fill_viridis_c() +
  labs(title = "Thin-Plate Spline: Original vs Fitted Values",
       x = "X",
       y = "Y",
       fill = "medFPQ") +
  theme_minimal() +
  theme(aspect.ratio = 1)
print(p)

ggsave("tps_original_vs_fitted.png", p, width = 12, height = 6)
# Tensor Product and Thin Plate Spline: Using mgcv function ----
# Load necessary libraries
library(gamair)
library(mgcv)
library(tidyverse)
library(ggplot2)
# Load the brain data and keep coordinates plus the response
data(brain)
brain_data <- brain %>%
  as_tibble() %>%
  select(X, Y, medFPQ)
# Tensor-product smooth (te) vs isotropic thin-plate smooth (s, bs = "tp")
tp_model <- gam(medFPQ ~ te(X, Y, k = 10), data = brain_data)
tps_model <- gam(medFPQ ~ s(X, Y, bs = "tp", k = 100), data = brain_data)
# Summarize the models
# (Per the rendered output below: the te() smooth uses ~90 edf and the
# tp smooth ~86 edf; adjusted R^2 and GCV are similar for the two fits.)
summary(tp_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medFPQ ~ te(X, Y, k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.24791 0.03003 41.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(X,Y) 90.09 91.84 9.934 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.347 Deviance explained = 38.4%
## GCV = 1.5007 Scale est. = 1.4135 n = 1567
# Thin-plate spline model summary (rendered output follows)
summary(tps_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medFPQ ~ s(X, Y, bs = "tp", k = 100)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.24791 0.03029 41.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(X,Y) 86.49 96.33 8.503 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.335 Deviance explained = 37.2%
## GCV = 1.5232 Scale est. = 1.4381 n = 1567
# Predict from both mgcv models at the observed locations
brain_data$Fitted_tp <- predict(tp_model, newdata = brain_data)
brain_data$Fitted_tps <- predict(tps_model, newdata = brain_data)
# Shared tile-plot scaffolding, parameterized by fill column and labels
plot_surface <- function(fill_var, fill_lab, plot_title) {
  ggplot(brain_data, aes(x = X, y = Y, fill = .data[[fill_var]])) +
    geom_tile() +
    scale_fill_viridis_c() +
    labs(title = plot_title,
         x = "X coordinate",
         y = "Y coordinate",
         fill = fill_lab) +
    theme_minimal()
}
original_plot <- plot_surface("medFPQ", "medFPQ", "Original Data")
tp_plot <- plot_surface("Fitted_tp", "Fitted medFPQ", "Tensor-Product Spline Fit using mgcv")
tps_plot <- plot_surface("Fitted_tps", "Fitted medFPQ", "Thin-Plate Spline Fit using mgcv")
# Save the plots
ggsave("original_plot.png", original_plot, width = 8, height = 6)
ggsave("tp_plot.png", tp_plot, width = 8, height = 6)
ggsave("tps_plot.png", tps_plot, width = 8, height = 6)
# Question 3: GAMs and MARS ----
# GAM and Logistic Regression ----
library(mgcv)
rm(list = ls())
# FIX: the original used read.csv2(..., sep = ","). read.csv2 keeps its
# default dec = ",", which conflicts with a comma field separator and the
# decimal-point numbers in this file; read.csv (sep = ",", dec = ".")
# matches the file format.
pulsar_data = read.csv("Pulsar.csv", header = TRUE, sep = ",")
head(pulsar_data)
## Mean_Integrated SD EK Skewness Mean_DMSNR_Curve
## 1 140.5625 55.68378214 -0.234571412 -0.699648398 3.199832776
## 2 102.5078125 58.88243001 0.465318154 -0.515087909 1.677257525
## 3 103.015625 39.34164944 0.323328365 1.051164429 3.121237458
## 4 136.75 57.17844874 -0.068414638 -0.636238369 3.642976589
## 5 88.7265625 40.67222541 0.600866079 1.123491692 1.178929766
## 6 93.5703125 46.69811352 0.53190485 0.416721117 1.636287625
## SD_DMSNR_Curve EK_DMSNR_Curve Skewness_DMSNR_Curve Class
## 1 19.11042633 7.975531794 74.24222492 0
## 2 14.86014572 10.57648674 127.3935796 0
## 3 21.74466875 7.735822015 63.17190911 0
## 4 20.9592803 6.89649891 53.59366067 0
## 5 11.4687196 14.26957284 252.5673058 0
## 6 14.54507425 10.6217484 131.3940043 0
# Split the data into training and test sets (80% training, 20% test).
# Renamed `sample` -> `train_idx` so base::sample is not shadowed.
set.seed(10032024)
train_idx <- sample.int(n = nrow(pulsar_data), size = floor(.8 * nrow(pulsar_data)), replace = FALSE)
train <- pulsar_data[train_idx, ]
test <- pulsar_data[-train_idx, ]
# Separate features and target variable
X_train <- train[, !names(train) %in% "Class"]
y_train <- train$Class
X_test <- test[, !names(test) %in% "Class"]
y_test <- test$Class
# Fit Logistic Regression Model (GLM) — left commented out as in the draft.
# FIX in the commented code: it referenced `training_data`, which is never
# defined; the training frame is called `train`.
#log_regression_model = glm(Class ~ ., data = train, family = binomial)
#gam_model <- gam(Class ~ s(Mean_Integrated) + s(SD) + s(EK) + s(Skewness) +
#  s(Mean_DMSNR_Curve) + s(SD_DMSNR_Curve) + s(EK_DMSNR_Curve) +
#  s(Skewness_DMSNR_Curve), data = train, family = binomial)
# MARS ----
# library(earth)
#
# # Fit the MARS model
# mars_model <- earth(Class ~ ., data = train)
#
#
# y_pred <- predict(mars_model, newdata = X_test, type = "response")
#
# y_pred_binary <- ifelse(y_pred > 0.5, 1, 0)
#
# # Evaluate the model
# accuracy <- mean(y_pred_binary == y_test)
# cat("Accuracy:", accuracy, "\n")
#
# # Confusion matrix and classification report
# table(Predicted = y_pred_binary, Actual = y_test)
library(earth)
library(caret)
library(ggplot2)
rm(list = ls())
pulsar_data <- read.csv("Pulsar.csv", header = TRUE, sep = ",")
# Feature matrix and binary target
X <- pulsar_data[, !(names(pulsar_data) %in% c("Class"))]
y <- pulsar_data$Class
# Stratified 80/20 train/test split via caret
set.seed(0)
trainIndex <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[trainIndex, ]
X_test <- X[-trainIndex, ]
y_train <- y[trainIndex]
y_test <- y[-trainIndex]
# Recombine predictors and target for the model formula
train_data <- data.frame(X_train, target = y_train)
# Fit the MARS model on all predictors
mars_model <- earth(target ~ ., data = train_data)
# Score the held-out set and threshold the (numeric) predictions at 0.5
test_data <- data.frame(X_test)
y_pred_prob <- predict(mars_model, test_data)
y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)
# Confusion matrix of predicted vs actual class
conf_matrix <- confusionMatrix(as.factor(y_pred), as.factor(y_test))
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3234 55
## 1 13 277
##
## Accuracy : 0.981
## 95% CI : (0.976, 0.9852)
## No Information Rate : 0.9072
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8803
##
## Mcnemar's Test P-Value : 6.627e-07
##
## Sensitivity : 0.9960
## Specificity : 0.8343
## Pos Pred Value : 0.9833
## Neg Pred Value : 0.9552
## Prevalence : 0.9072
## Detection Rate : 0.9036
## Detection Prevalence : 0.9190
## Balanced Accuracy : 0.9152
##
## 'Positive' Class : 0
##
# Pull the headline metrics out of the caret confusionMatrix object
accuracy <- conf_matrix$overall["Accuracy"]
sensitivity <- conf_matrix$byClass["Sensitivity"]
specificity <- conf_matrix$byClass["Specificity"]
precision <- conf_matrix$byClass["Precision"]
f1 <- conf_matrix$byClass["F1"]
# Collect them into a single summary table
metrics <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Precision", "F1 Score"),
  Value = c(accuracy, sensitivity, specificity, precision, f1)
)
# Print the metrics table
print(metrics)
## Metric Value
## Accuracy Accuracy 0.9810003
## Sensitivity Sensitivity 0.9959963
## Specificity Specificity 0.8343373
## Precision Precision 0.9832776
## F1 F1 Score 0.9895961
# Create the results data frame
# NOTE(review): y_pred_prob is a one-column matrix whose column is named
# "target" (from the earth model formula), so data.frame() keeps that name —
# the column appears as `target`, not `Predicted_Prob`, as the str() output
# below shows. Wrap it in as.vector() if the intended name matters.
results <- data.frame(Actual = y_test, Predicted_Prob = y_pred_prob)
# Print the structure and head of the results data frame
str(results)
## 'data.frame': 3579 obs. of 2 variables:
## $ Actual: int 0 0 0 0 0 0 0 0 0 0 ...
## $ target: num -0.01737 -0.02365 0.00249 -0.01425 0.04996 ...
head(results)
## Actual target
## 1 0 -0.01737459
## 2 0 -0.02364795
## 3 0 0.00248694
## 4 0 -0.01424527
## 5 0 0.04996036
## 6 0 -0.02077186
# # Visualize the results using ggplot2
# ggplot(results, aes(x = Actual, y = Predicted_Prob)) +
# geom_point(alpha = 0.5) +
# geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
# labs(title = "Actual vs Predicted Probabilities", x = "Actual values", y = "Predicted probabilities") +
# theme_minimal()
# Question 4: Wavelets ----
library(fds)
library(wavelets)
library(glmnet)
library(ggplot2)
library(wavethresh)
library(tidyr)
rm(list = ls())
# Phoneme data: two classes of signals, 'aa' and 'ao'
data(aa)
data(ao)
aa_data <- aa$y
ao_data <- ao$y
# Stack the two classes and label them aa = 0, ao = 1
X <- rbind(aa_data, ao_data)
y <- c(rep(0, nrow(aa_data)), rep(1, nrow(ao_data)))
# Zero-pad a signal to the next power of 2, take its discrete wavelet
# transform (Daubechies least-asymmetric, filter.number = 10), and return
# the first n_coef detail coefficients collected across all levels.
apply_wavelet <- function(data, n_coef = 256) {
  padded_len <- 2^ceiling(log2(length(data)))
  padded <- c(data, rep(0, padded_len - length(data)))
  wt <- wd(padded, filter.number = 10, family = "DaubLeAsymm")
  levels <- seq_len(nlevelsWT(wt)) - 1
  coefs <- unlist(lapply(levels, function(l) accessD(wt, level = l)))
  coefs[seq_len(min(length(coefs), n_coef))]
}
# Wavelet-transform every signal (one row of X each) into a coefficient matrix
X_wavelet <- t(apply(X, 1, apply_wavelet))
# Plot a random sample of raw signals alongside their wavelet coefficients.
# Returns a list with signal_plot and wavelet_plot (both ggplot objects).
plot_samples <- function(data, wavelet_data, title_signal, title_wavelet, sample_size = 10) {
  chosen <- sample(nrow(data), sample_size)
  raw_subset <- data[chosen, ]
  coef_subset <- wavelet_data[chosen, ]
  # Long format for the raw signals: one row per (index, sample) pair
  raw_df <- as.data.frame(t(raw_subset))
  raw_df$Index <- 1:nrow(raw_df)
  raw_long <- pivot_longer(raw_df, -Index, names_to = "Sample", values_to = "Amplitude")
  plot_signal <- ggplot(raw_long, aes(x = Index, y = Amplitude, color = Sample)) +
    geom_line(alpha = 0.5) +
    labs(title = title_signal, x = "Index", y = "Amplitude") +
    theme_minimal()
  # Same treatment for the wavelet coefficients
  coef_df <- as.data.frame(t(coef_subset))
  coef_df$Index <- 1:nrow(coef_df)
  coef_long <- pivot_longer(coef_df, -Index, names_to = "Sample", values_to = "Coefficient")
  plot_wavelet <- ggplot(coef_long, aes(x = Index, y = Coefficient, color = Sample)) +
    geom_line(alpha = 0.5) +
    labs(title = title_wavelet, x = "Coefficient Index", y = "Coefficient Value") +
    theme_minimal()
  list(signal_plot = plot_signal, wavelet_plot = plot_wavelet)
}
# Signals and wavelet coefficients for the 'aa' class
aa_plots <- plot_samples(aa_data, t(apply(aa_data, 1, apply_wavelet)),
                         "Sample of Signals from 'aa' Class",
                         "Sample of Wavelet Coefficients for 'aa' Class")
print(aa_plots$signal_plot)

print(aa_plots$wavelet_plot)

# Signals and wavelet coefficients for the 'ao' class
ao_plots <- plot_samples(ao_data, t(apply(ao_data, 1, apply_wavelet)),
                         "Sample of Signals from 'ao' Class",
                         "Sample of Wavelet Coefficients for 'ao' Class")
print(ao_plots$signal_plot)

print(ao_plots$wavelet_plot)

# 70/30 train/test split of the wavelet-coefficient matrix
set.seed(123)
total_samples <- nrow(X_wavelet)
train_size <- floor(0.7 * total_samples)
train_indices <- sample(1:total_samples, train_size)
X_train <- X_wavelet[train_indices, ]
X_test <- X_wavelet[-train_indices, ]
y_train <- y[train_indices]
y_test <- y[-train_indices]
# Lasso-penalized logistic regression; lambda chosen by cross-validation,
# then refit at the selected lambda
cv_fit <- cv.glmnet(X_train, y_train, family = "binomial", alpha = 1)
best_lambda <- cv_fit$lambda.min
model <- glmnet(X_train, y_train, family = "binomial", alpha = 1, lambda = best_lambda)
# Classify the held-out signals at the 0.5 probability threshold
predictions <- predict(model, X_test, type = "response")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
# Proportion of correct test-set classifications
accuracy <- mean(predicted_classes == y_test)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 1"
# Create confusion matrix
conf_matrix <- table(Predicted = predicted_classes, Actual = y_test)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 43 0
## 1 0 47
# Calculate evaluation metrics.
# Class "1" ('ao') is treated as positive: precision = TP / (TP + FP),
# recall = TP / (TP + FN), F1 = harmonic mean of the two.
precision <- conf_matrix[2, 2] / (conf_matrix[2, 2] + conf_matrix[2, 1])
recall <- conf_matrix[2, 2] / (conf_matrix[2, 2] + conf_matrix[1, 2])
f1_score <- 2 * precision * recall / (precision + recall)
# Single summary table (accuracy comes from the previous chunk)
metrics <- data.frame(
Metric = c("Accuracy", "Precision", "Recall", "F1 Score"),
Value = c(accuracy, precision, recall, f1_score)
)
print(metrics)
## Metric Value
## 1 Accuracy 1
## 2 Precision 1
## 3 Recall 1
## 4 F1 Score 1
# Feature importance = absolute lasso coefficient (intercept dropped via [-1])
coef_importance <- abs(coef(model)[-1])
top_features <- order(coef_importance, decreasing = TRUE)[1:10]
print("Top 10 most important wavelet coefficients:")
## [1] "Top 10 most important wavelet coefficients:"
print(top_features)
## [1] 83 19 144 88 12 75 204 142 104 37
# Scatter all importances and highlight the top 10 in red
importance_df <- data.frame(Index = 1:length(coef_importance), Importance = coef_importance)
top_df <- data.frame(Index = top_features, Importance = coef_importance[top_features])
ggplot(importance_df, aes(x = Index, y = Importance)) +
  geom_point() +
  geom_point(data = top_df, aes(x = Index, y = Importance), color = "red", size = 3) +
  labs(title = "Wavelet Coefficient Importances", x = "Coefficient Index", y = "Absolute Coefficient Value") +
  theme_minimal()

# Extract and plot the non-zero coefficients
# NOTE(review): coef(model) includes the intercept at position 1, so these
# indices are shifted by one relative to the wavelet-coefficient indices in
# the importance plot above — confirm before cross-referencing the two plots.
non_zero_coeffs <- coef(model)[coef(model) != 0]
non_zero_indices <- which(coef(model) != 0)
# Create a data frame for plotting
coeff_data <- data.frame(Index = non_zero_indices, Coefficient = non_zero_coeffs)
# Plot the non-zero coefficients
ggplot(coeff_data, aes(x = Index, y = Coefficient)) +
geom_bar(stat = "identity") +
labs(title = "Non-zero Coefficients from Penalized Logistic Regression",
x = "Wavelet Coefficient Index",
y = "Coefficient Value") +
theme_minimal()

# Question 5: Functional Data Analysis ----
library(gamair)
library(ggplot2)
library(fda)
rm(list = ls())
data(gas)
# Rebuild the 60 x 401 NIR matrix, then transpose so each column is a curve
nir_spectra <- matrix(as.numeric(unlist(gas$NIR)), nrow = 60, byrow = TRUE)
colnames(nir_spectra) <- colnames(gas$NIR)
nir_spectra <- t(nir_spectra)  # 401 wavelengths x 60 samples
octane <- gas$octane
# 20-function B-spline basis over the recorded wavelength range (900-1700 nm)
nir_basis <- create.bspline.basis(rangeval = c(900, 1700), nbasis = 20)
# Smooth each spectrum (sampled every 2 nm) into a functional data object
nir_fd <- smooth.basis(argvals = seq(900, 1700, by = 2), y = nir_spectra, fdParobj = nir_basis)$fd
plot(nir_fd, main = "NIR Spectra", xlab = "Wavelength (nm)", ylab = "log(1/R)")

## [1] "done"
# Get the coefficients of the functional data object
# NOTE(review): eval.fd EVALUATES the smoothed curves at all 401 wavelengths —
# these are fitted function values, not the 20 basis coefficients
# (those are in nir_fd$coefs). Regressing 60 observations on 401 highly
# collinear columns leaves lm() rank-deficient: the summary below reports
# 374 coefficients "not defined because of singularities". Consider using
# nir_fd$coefs, FPCA scores, or a functional regression (fda::fRegress).
nir_fd_coefs <- t(eval.fd(seq(900, 1700, by = 2), nir_fd))
# Fit a linear model using the coefficients
lm_model <- lm(octane ~ nir_fd_coefs)
summary(lm_model)
##
## Call:
## lm(formula = octane ~ nir_fd_coefs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7141 -0.7166 0.1786 0.7055 1.9047
##
## Coefficients: (374 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.750e+01 3.831e-01 228.394 <2e-16 ***
## nir_fd_coefs1 1.044e+16 8.651e+15 1.207 0.2363
## nir_fd_coefs2 -1.499e+16 1.778e+16 -0.843 0.4052
## nir_fd_coefs3 -2.618e+15 1.643e+16 -0.159 0.8744
## nir_fd_coefs4 7.115e+15 8.911e+15 0.798 0.4305
## nir_fd_coefs5 NA NA NA NA
## nir_fd_coefs6 NA NA NA NA
## nir_fd_coefs7 NA NA NA NA
## nir_fd_coefs8 NA NA NA NA
## nir_fd_coefs9 NA NA NA NA
## nir_fd_coefs10 NA NA NA NA
## nir_fd_coefs11 NA NA NA NA
## nir_fd_coefs12 NA NA NA NA
## nir_fd_coefs13 NA NA NA NA
## nir_fd_coefs14 NA NA NA NA
## nir_fd_coefs15 NA NA NA NA
## nir_fd_coefs16 NA NA NA NA
## nir_fd_coefs17 NA NA NA NA
## nir_fd_coefs18 NA NA NA NA
## nir_fd_coefs19 NA NA NA NA
## nir_fd_coefs20 NA NA NA NA
## nir_fd_coefs21 NA NA NA NA
## nir_fd_coefs22 NA NA NA NA
## nir_fd_coefs23 NA NA NA NA
## nir_fd_coefs24 NA NA NA NA
## nir_fd_coefs25 NA NA NA NA
## nir_fd_coefs26 7.297e+14 7.204e+14 1.013 0.3187
## nir_fd_coefs27 NA NA NA NA
## nir_fd_coefs28 NA NA NA NA
## nir_fd_coefs29 NA NA NA NA
## nir_fd_coefs30 NA NA NA NA
## nir_fd_coefs31 NA NA NA NA
## nir_fd_coefs32 NA NA NA NA
## nir_fd_coefs33 NA NA NA NA
## nir_fd_coefs34 NA NA NA NA
## nir_fd_coefs35 NA NA NA NA
## nir_fd_coefs36 NA NA NA NA
## nir_fd_coefs37 NA NA NA NA
## nir_fd_coefs38 NA NA NA NA
## nir_fd_coefs39 NA NA NA NA
## nir_fd_coefs40 NA NA NA NA
## nir_fd_coefs41 NA NA NA NA
## nir_fd_coefs42 NA NA NA NA
## nir_fd_coefs43 NA NA NA NA
## nir_fd_coefs44 NA NA NA NA
## nir_fd_coefs45 NA NA NA NA
## nir_fd_coefs46 NA NA NA NA
## nir_fd_coefs47 NA NA NA NA
## nir_fd_coefs48 NA NA NA NA
## nir_fd_coefs49 -1.560e+16 1.230e+16 -1.268 0.2139
## nir_fd_coefs50 NA NA NA NA
## nir_fd_coefs51 1.588e+16 1.251e+16 1.269 0.2135
## nir_fd_coefs52 NA NA NA NA
## nir_fd_coefs53 NA NA NA NA
## nir_fd_coefs54 NA NA NA NA
## nir_fd_coefs55 NA NA NA NA
## nir_fd_coefs56 NA NA NA NA
## nir_fd_coefs57 NA NA NA NA
## nir_fd_coefs58 NA NA NA NA
## nir_fd_coefs59 NA NA NA NA
## nir_fd_coefs60 NA NA NA NA
## nir_fd_coefs61 NA NA NA NA
## nir_fd_coefs62 NA NA NA NA
## nir_fd_coefs63 NA NA NA NA
## nir_fd_coefs64 NA NA NA NA
## nir_fd_coefs65 NA NA NA NA
## nir_fd_coefs66 NA NA NA NA
## nir_fd_coefs67 NA NA NA NA
## nir_fd_coefs68 NA NA NA NA
## nir_fd_coefs69 NA NA NA NA
## nir_fd_coefs70 NA NA NA NA
## nir_fd_coefs71 NA NA NA NA
## nir_fd_coefs72 NA NA NA NA
## nir_fd_coefs73 -9.198e+14 9.519e+14 -0.966 0.3411
## nir_fd_coefs74 NA NA NA NA
## nir_fd_coefs75 NA NA NA NA
## nir_fd_coefs76 NA NA NA NA
## nir_fd_coefs77 NA NA NA NA
## nir_fd_coefs78 NA NA NA NA
## nir_fd_coefs79 NA NA NA NA
## nir_fd_coefs80 NA NA NA NA
## nir_fd_coefs81 NA NA NA NA
## nir_fd_coefs82 NA NA NA NA
## nir_fd_coefs83 NA NA NA NA
## nir_fd_coefs84 NA NA NA NA
## nir_fd_coefs85 NA NA NA NA
## nir_fd_coefs86 NA NA NA NA
## nir_fd_coefs87 NA NA NA NA
## nir_fd_coefs88 NA NA NA NA
## nir_fd_coefs89 NA NA NA NA
## nir_fd_coefs90 NA NA NA NA
## nir_fd_coefs91 NA NA NA NA
## nir_fd_coefs92 NA NA NA NA
## nir_fd_coefs93 NA NA NA NA
## nir_fd_coefs94 NA NA NA NA
## nir_fd_coefs95 NA NA NA NA
## nir_fd_coefs96 -1.701e+16 1.564e+16 -1.088 0.2848
## nir_fd_coefs97 1.783e+16 1.587e+16 1.124 0.2695
## nir_fd_coefs98 NA NA NA NA
## nir_fd_coefs99 NA NA NA NA
## nir_fd_coefs100 NA NA NA NA
## nir_fd_coefs101 NA NA NA NA
## nir_fd_coefs102 NA NA NA NA
## nir_fd_coefs103 NA NA NA NA
## nir_fd_coefs104 NA NA NA NA
## nir_fd_coefs105 NA NA NA NA
## nir_fd_coefs106 NA NA NA NA
## nir_fd_coefs107 NA NA NA NA
## nir_fd_coefs108 NA NA NA NA
## nir_fd_coefs109 NA NA NA NA
## nir_fd_coefs110 NA NA NA NA
## nir_fd_coefs111 NA NA NA NA
## nir_fd_coefs112 NA NA NA NA
## nir_fd_coefs113 NA NA NA NA
## nir_fd_coefs114 NA NA NA NA
## nir_fd_coefs115 NA NA NA NA
## nir_fd_coefs116 NA NA NA NA
## nir_fd_coefs117 NA NA NA NA
## nir_fd_coefs118 NA NA NA NA
## nir_fd_coefs119 NA NA NA NA
## nir_fd_coefs120 -2.020e+15 8.786e+14 -2.299 0.0282 *
## nir_fd_coefs121 NA NA NA NA
## nir_fd_coefs122 NA NA NA NA
## nir_fd_coefs123 NA NA NA NA
## nir_fd_coefs124 NA NA NA NA
## nir_fd_coefs125 NA NA NA NA
## nir_fd_coefs126 NA NA NA NA
## nir_fd_coefs127 NA NA NA NA
## nir_fd_coefs128 NA NA NA NA
## nir_fd_coefs129 NA NA NA NA
## nir_fd_coefs130 NA NA NA NA
## nir_fd_coefs131 NA NA NA NA
## nir_fd_coefs132 NA NA NA NA
## nir_fd_coefs133 NA NA NA NA
## nir_fd_coefs134 NA NA NA NA
## nir_fd_coefs135 NA NA NA NA
## nir_fd_coefs136 NA NA NA NA
## nir_fd_coefs137 NA NA NA NA
## nir_fd_coefs138 NA NA NA NA
## nir_fd_coefs139 NA NA NA NA
## nir_fd_coefs140 NA NA NA NA
## nir_fd_coefs141 NA NA NA NA
## nir_fd_coefs142 NA NA NA NA
## nir_fd_coefs143 NA NA NA NA
## nir_fd_coefs144 1.697e+16 9.796e+15 1.732 0.0928 .
## nir_fd_coefs145 NA NA NA NA
## nir_fd_coefs146 NA NA NA NA
## nir_fd_coefs147 -1.733e+16 1.026e+16 -1.690 0.1007
## nir_fd_coefs148 NA NA NA NA
## nir_fd_coefs149 NA NA NA NA
## nir_fd_coefs150 NA NA NA NA
## nir_fd_coefs151 NA NA NA NA
## nir_fd_coefs152 NA NA NA NA
## nir_fd_coefs153 NA NA NA NA
## nir_fd_coefs154 NA NA NA NA
## nir_fd_coefs155 NA NA NA NA
## nir_fd_coefs156 NA NA NA NA
## nir_fd_coefs157 NA NA NA NA
## nir_fd_coefs158 NA NA NA NA
## nir_fd_coefs159 NA NA NA NA
## nir_fd_coefs160 NA NA NA NA
## nir_fd_coefs161 NA NA NA NA
## nir_fd_coefs162 NA NA NA NA
## nir_fd_coefs163 NA NA NA NA
## nir_fd_coefs164 NA NA NA NA
## nir_fd_coefs165 NA NA NA NA
## nir_fd_coefs166 NA NA NA NA
## nir_fd_coefs167 5.949e+14 2.125e+15 0.280 0.7813
## nir_fd_coefs168 NA NA NA NA
## nir_fd_coefs169 NA NA NA NA
## nir_fd_coefs170 NA NA NA NA
## nir_fd_coefs171 NA NA NA NA
## nir_fd_coefs172 NA NA NA NA
## nir_fd_coefs173 NA NA NA NA
## nir_fd_coefs174 NA NA NA NA
## nir_fd_coefs175 NA NA NA NA
## nir_fd_coefs176 NA NA NA NA
## nir_fd_coefs177 NA NA NA NA
## nir_fd_coefs178 NA NA NA NA
## nir_fd_coefs179 NA NA NA NA
## nir_fd_coefs180 NA NA NA NA
## nir_fd_coefs181 NA NA NA NA
## nir_fd_coefs182 NA NA NA NA
## nir_fd_coefs183 NA NA NA NA
## nir_fd_coefs184 NA NA NA NA
## nir_fd_coefs185 NA NA NA NA
## nir_fd_coefs186 NA NA NA NA
## nir_fd_coefs187 NA NA NA NA
## nir_fd_coefs188 NA NA NA NA
## nir_fd_coefs189 NA NA NA NA
## nir_fd_coefs190 NA NA NA NA
## nir_fd_coefs191 1.466e+16 1.835e+16 0.799 0.4302
## nir_fd_coefs192 NA NA NA NA
## nir_fd_coefs193 NA NA NA NA
## nir_fd_coefs194 NA NA NA NA
## nir_fd_coefs195 NA NA NA NA
## nir_fd_coefs196 NA NA NA NA
## nir_fd_coefs197 -1.807e+16 2.204e+16 -0.820 0.4184
## nir_fd_coefs198 NA NA NA NA
## nir_fd_coefs199 NA NA NA NA
## nir_fd_coefs200 NA NA NA NA
## nir_fd_coefs201 NA NA NA NA
## nir_fd_coefs202 NA NA NA NA
## nir_fd_coefs203 NA NA NA NA
## nir_fd_coefs204 NA NA NA NA
## nir_fd_coefs205 NA NA NA NA
## nir_fd_coefs206 NA NA NA NA
## nir_fd_coefs207 NA NA NA NA
## nir_fd_coefs208 NA NA NA NA
## nir_fd_coefs209 NA NA NA NA
## nir_fd_coefs210 NA NA NA NA
## nir_fd_coefs211 NA NA NA NA
## nir_fd_coefs212 NA NA NA NA
## nir_fd_coefs213 NA NA NA NA
## nir_fd_coefs214 4.365e+15 6.249e+15 0.698 0.4900
## nir_fd_coefs215 NA NA NA NA
## nir_fd_coefs216 NA NA NA NA
## nir_fd_coefs217 NA NA NA NA
## nir_fd_coefs218 NA NA NA NA
## nir_fd_coefs219 NA NA NA NA
## nir_fd_coefs220 NA NA NA NA
## nir_fd_coefs221 NA NA NA NA
## nir_fd_coefs222 NA NA NA NA
## nir_fd_coefs223 NA NA NA NA
## nir_fd_coefs224 NA NA NA NA
## nir_fd_coefs225 NA NA NA NA
## nir_fd_coefs226 NA NA NA NA
## nir_fd_coefs227 NA NA NA NA
## nir_fd_coefs228 NA NA NA NA
## nir_fd_coefs229 NA NA NA NA
## nir_fd_coefs230 NA NA NA NA
## nir_fd_coefs231 NA NA NA NA
## nir_fd_coefs232 NA NA NA NA
## nir_fd_coefs233 NA NA NA NA
## nir_fd_coefs234 NA NA NA NA
## nir_fd_coefs235 NA NA NA NA
## nir_fd_coefs236 NA NA NA NA
## nir_fd_coefs237 NA NA NA NA
## nir_fd_coefs238 7.822e+15 4.979e+15 1.571 0.1260
## nir_fd_coefs239 NA NA NA NA
## nir_fd_coefs240 NA NA NA NA
## nir_fd_coefs241 NA NA NA NA
## nir_fd_coefs242 NA NA NA NA
## nir_fd_coefs243 NA NA NA NA
## nir_fd_coefs244 NA NA NA NA
## nir_fd_coefs245 NA NA NA NA
## nir_fd_coefs246 NA NA NA NA
## nir_fd_coefs247 NA NA NA NA
## nir_fd_coefs248 -1.494e+16 9.842e+15 -1.518 0.1389
## nir_fd_coefs249 NA NA NA NA
## nir_fd_coefs250 NA NA NA NA
## nir_fd_coefs251 NA NA NA NA
## nir_fd_coefs252 NA NA NA NA
## nir_fd_coefs253 NA NA NA NA
## nir_fd_coefs254 NA NA NA NA
## nir_fd_coefs255 NA NA NA NA
## nir_fd_coefs256 NA NA NA NA
## nir_fd_coefs257 NA NA NA NA
## nir_fd_coefs258 NA NA NA NA
## nir_fd_coefs259 NA NA NA NA
## nir_fd_coefs260 NA NA NA NA
## nir_fd_coefs261 8.855e+15 5.970e+15 1.483 0.1478
## nir_fd_coefs262 NA NA NA NA
## nir_fd_coefs263 NA NA NA NA
## nir_fd_coefs264 NA NA NA NA
## nir_fd_coefs265 NA NA NA NA
## nir_fd_coefs266 NA NA NA NA
## nir_fd_coefs267 NA NA NA NA
## nir_fd_coefs268 NA NA NA NA
## nir_fd_coefs269 NA NA NA NA
## nir_fd_coefs270 NA NA NA NA
## nir_fd_coefs271 NA NA NA NA
## nir_fd_coefs272 NA NA NA NA
## nir_fd_coefs273 NA NA NA NA
## nir_fd_coefs274 NA NA NA NA
## nir_fd_coefs275 NA NA NA NA
## nir_fd_coefs276 NA NA NA NA
## nir_fd_coefs277 NA NA NA NA
## nir_fd_coefs278 NA NA NA NA
## nir_fd_coefs279 NA NA NA NA
## nir_fd_coefs280 NA NA NA NA
## nir_fd_coefs281 NA NA NA NA
## nir_fd_coefs282 NA NA NA NA
## nir_fd_coefs283 NA NA NA NA
## nir_fd_coefs284 NA NA NA NA
## nir_fd_coefs285 -2.659e+15 1.803e+15 -1.475 0.1500
## nir_fd_coefs286 NA NA NA NA
## nir_fd_coefs287 NA NA NA NA
## nir_fd_coefs288 NA NA NA NA
## nir_fd_coefs289 NA NA NA NA
## nir_fd_coefs290 NA NA NA NA
## nir_fd_coefs291 NA NA NA NA
## nir_fd_coefs292 NA NA NA NA
## nir_fd_coefs293 NA NA NA NA
## nir_fd_coefs294 NA NA NA NA
## nir_fd_coefs295 NA NA NA NA
## nir_fd_coefs296 NA NA NA NA
## nir_fd_coefs297 NA NA NA NA
## nir_fd_coefs298 NA NA NA NA
## nir_fd_coefs299 NA NA NA NA
## nir_fd_coefs300 NA NA NA NA
## nir_fd_coefs301 NA NA NA NA
## nir_fd_coefs302 1.592e+15 1.080e+15 1.475 0.1500
## nir_fd_coefs303 NA NA NA NA
## nir_fd_coefs304 NA NA NA NA
## nir_fd_coefs305 NA NA NA NA
## nir_fd_coefs306 NA NA NA NA
## nir_fd_coefs307 NA NA NA NA
## nir_fd_coefs308 -6.892e+14 4.673e+14 -1.475 0.1500
## nir_fd_coefs309 NA NA NA NA
## nir_fd_coefs310 NA NA NA NA
## nir_fd_coefs311 NA NA NA NA
## nir_fd_coefs312 NA NA NA NA
## nir_fd_coefs313 NA NA NA NA
## nir_fd_coefs314 NA NA NA NA
## nir_fd_coefs315 NA NA NA NA
## nir_fd_coefs316 NA NA NA NA
## nir_fd_coefs317 NA NA NA NA
## nir_fd_coefs318 NA NA NA NA
## nir_fd_coefs319 NA NA NA NA
## nir_fd_coefs320 NA NA NA NA
## nir_fd_coefs321 NA NA NA NA
## nir_fd_coefs322 NA NA NA NA
## nir_fd_coefs323 NA NA NA NA
## nir_fd_coefs324 NA NA NA NA
## nir_fd_coefs325 NA NA NA NA
## nir_fd_coefs326 NA NA NA NA
## nir_fd_coefs327 NA NA NA NA
## nir_fd_coefs328 NA NA NA NA
## nir_fd_coefs329 NA NA NA NA
## nir_fd_coefs330 NA NA NA NA
## nir_fd_coefs331 NA NA NA NA
## nir_fd_coefs332 6.192e+10 4.079e+10 1.518 0.1388
## nir_fd_coefs333 NA NA NA NA
## nir_fd_coefs334 NA NA NA NA
## nir_fd_coefs335 NA NA NA NA
## nir_fd_coefs336 NA NA NA NA
## nir_fd_coefs337 NA NA NA NA
## nir_fd_coefs338 NA NA NA NA
## nir_fd_coefs339 NA NA NA NA
## nir_fd_coefs340 NA NA NA NA
## nir_fd_coefs341 NA NA NA NA
## nir_fd_coefs342 NA NA NA NA
## nir_fd_coefs343 NA NA NA NA
## nir_fd_coefs344 NA NA NA NA
## nir_fd_coefs345 NA NA NA NA
## nir_fd_coefs346 NA NA NA NA
## nir_fd_coefs347 NA NA NA NA
## nir_fd_coefs348 NA NA NA NA
## nir_fd_coefs349 NA NA NA NA
## nir_fd_coefs350 NA NA NA NA
## nir_fd_coefs351 NA NA NA NA
## nir_fd_coefs352 NA NA NA NA
## nir_fd_coefs353 NA NA NA NA
## nir_fd_coefs354 NA NA NA NA
## nir_fd_coefs355 -2.839e+09 1.094e+10 -0.260 0.7968
## nir_fd_coefs356 2.513e+09 9.736e+09 0.258 0.7980
## nir_fd_coefs357 NA NA NA NA
## nir_fd_coefs358 NA NA NA NA
## nir_fd_coefs359 NA NA NA NA
## nir_fd_coefs360 NA NA NA NA
## nir_fd_coefs361 NA NA NA NA
## nir_fd_coefs362 NA NA NA NA
## nir_fd_coefs363 NA NA NA NA
## nir_fd_coefs364 NA NA NA NA
## nir_fd_coefs365 NA NA NA NA
## nir_fd_coefs366 NA NA NA NA
## nir_fd_coefs367 NA NA NA NA
## nir_fd_coefs368 NA NA NA NA
## nir_fd_coefs369 NA NA NA NA
## nir_fd_coefs370 NA NA NA NA
## nir_fd_coefs371 NA NA NA NA
## nir_fd_coefs372 NA NA NA NA
## nir_fd_coefs373 NA NA NA NA
## nir_fd_coefs374 NA NA NA NA
## nir_fd_coefs375 NA NA NA NA
## nir_fd_coefs376 NA NA NA NA
## nir_fd_coefs377 NA NA NA NA
## nir_fd_coefs378 -1.332e+06 5.170e+06 -0.258 0.7983
## nir_fd_coefs379 NA NA NA NA
## nir_fd_coefs380 NA NA NA NA
## nir_fd_coefs381 NA NA NA NA
## nir_fd_coefs382 NA NA NA NA
## nir_fd_coefs383 NA NA NA NA
## nir_fd_coefs384 NA NA NA NA
## nir_fd_coefs385 NA NA NA NA
## nir_fd_coefs386 NA NA NA NA
## nir_fd_coefs387 NA NA NA NA
## nir_fd_coefs388 NA NA NA NA
## nir_fd_coefs389 NA NA NA NA
## nir_fd_coefs390 NA NA NA NA
## nir_fd_coefs391 NA NA NA NA
## nir_fd_coefs392 NA NA NA NA
## nir_fd_coefs393 NA NA NA NA
## nir_fd_coefs394 NA NA NA NA
## nir_fd_coefs395 NA NA NA NA
## nir_fd_coefs396 NA NA NA NA
## nir_fd_coefs397 NA NA NA NA
## nir_fd_coefs398 NA NA NA NA
## nir_fd_coefs399 NA NA NA NA
## nir_fd_coefs400 NA NA NA NA
## nir_fd_coefs401 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.391 on 32 degrees of freedom
## Multiple R-squared: 0.5519, Adjusted R-squared: 0.1739
## F-statistic: 1.46 on 27 and 32 DF, p-value: 0.1519
# Obtain in-sample predictions from the fitted model and compare them
# with the observed octane ratings.
octane_pred <- predict(lm_model)
results <- data.frame(Actual = octane, Predicted = octane_pred)

# Scatter plot of actual vs predicted; the dashed red line is y = x,
# i.e. where a perfect prediction would fall.
ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_point(color = "blue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Actual vs Predicted Octane Ratings",
    x = "Actual Octane Rating",
    y = "Predicted Octane Rating"
  ) +
  theme_minimal()

# Visualize the fitted regression coefficients by their index.
# NOTE(review): `coefficients` shadows base::coefficients(); the name is kept
# for backward compatibility with the rest of the script.
coefficients <- coef(lm_model)
# seq_along() is safe even if the coefficient vector were empty,
# unlike 1:length(coefficients) which would yield c(1, 0).
coef_data <- data.frame(Index = seq_along(coefficients), Coefficient = coefficients)
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
ggplot(coef_data, aes(x = Index, y = Coefficient)) +
  geom_col() +
  labs(title = "Coefficients of the NIR Spectra",
       x = "Coefficient Index",
       y = "Coefficient Value") +
  theme_minimal()

library(gamair)
library(ggplot2)
library(splines)
rm(list = ls())
# Load the gasoline NIR data set shipped with gamair: 60 samples, each with
# an octane rating and a near-infrared spectrum.
data(gas)
# Extract the spectra and octane ratings
# Flatten gas$NIR to a plain numeric matrix with one row per sample.
# NOTE(review): byrow = TRUE assumes unlist(gas$NIR) emits values sample-major;
# verify against the structure of gas$NIR before changing this.
nir_spectra <- matrix(as.numeric(unlist(gas$NIR)), nrow = 60, byrow = TRUE)
colnames(nir_spectra) <- colnames(gas$NIR)
nir_spectra <- t(nir_spectra) # Transpose to have 401 rows and 60 columns
octane <- gas$octane
# Define the B-spline basis functions
# Cubic B-spline basis (with intercept) evaluated at the 401 measurement
# wavelengths 900, 902, ..., 1700 nm; df = 20 gives 20 basis columns.
basis_functions <- bs(seq(900, 1700, by = 2), df = 20, degree = 3, intercept = TRUE)
#' Project each column of `data` onto a basis matrix.
#'
#' @param data  Numeric matrix with one observation per column; must have the
#'   same number of rows as `basis` (evaluation points).
#' @param basis Numeric matrix of basis functions, one basis function per
#'   column, evaluated at the same points as the rows of `data`.
#' @return Matrix of size ncol(basis) x ncol(data): column i holds
#'   t(basis) %*% data[, i], exactly as the original per-column loop produced.
basis_expansion <- function(data, basis) {
  # Guard against silently conformable-but-wrong inputs.
  stopifnot(nrow(basis) == nrow(data))
  # crossprod(basis, data) == t(basis) %*% data, computed as one matrix
  # product instead of a loop of ncol(data) separate multiplications; this
  # also handles the zero-column edge case that 1:ncol(data) would break on.
  crossprod(basis, data)
}
# Apply basis expansion: project every spectrum onto the B-spline basis,
# then arrange the coefficients as one row per sample.
nir_fd_coefs <- basis_expansion(nir_spectra, basis_functions)
nir_fd_coefs_df <- as.data.frame(t(nir_fd_coefs))
colnames(nir_fd_coefs_df) <- paste0("X", seq_len(ncol(nir_fd_coefs_df)))

# Regress octane on the basis coefficients, dropping the intercept ("- 1")
# since the basis itself already includes an intercept column.
model_data <- cbind(octane = octane, nir_fd_coefs_df)
rhs <- paste(colnames(nir_fd_coefs_df), collapse = " + ")
formula <- as.formula(paste("octane ~", rhs, "- 1"))
lm_model <- lm(formula, data = model_data)
summary(lm_model)
##
## Call:
## lm(formula = formula, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.37 13.12 54.24 81.60 130.43
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X1 -1907.9 1025.8 -1.860 0.0703 .
## X2 1149.6 1948.9 0.590 0.5586
## X3 -656.2 2360.8 -0.278 0.7825
## X4 949.0 1831.7 0.518 0.6072
## X5 570.8 2122.2 0.269 0.7893
## X6 -1459.1 1701.9 -0.857 0.3964
## X7 -190.4 2276.4 -0.084 0.9338
## X8 -595.2 2231.0 -0.267 0.7910
## X9 1507.7 2110.0 0.715 0.4790
## X10 426.0 3048.5 0.140 0.8896
## X11 1389.5 2558.7 0.543 0.5901
## X12 -3365.9 2972.6 -1.132 0.2643
## X13 -132.3 2817.8 -0.047 0.9628
## X14 -351.4 2508.1 -0.140 0.8893
## X15 4001.0 2626.5 1.523 0.1356
## X16 -2283.2 2597.2 -0.879 0.3846
## X17 744.2 2049.0 0.363 0.7184
## X18 -2820.2 2314.7 -1.218 0.2302
## X19 2217.7 1714.6 1.293 0.2033
## X20 889.6 913.6 0.974 0.3360
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 83.7 on 40 degrees of freedom
## Multiple R-squared: 0.3856, Adjusted R-squared: 0.07842
## F-statistic: 1.255 on 20 and 40 DF, p-value: 0.2637
# First-principles prediction: multiply the design matrix of basis
# coefficients by the fitted regression coefficients directly, instead of
# calling predict().
fitted_coefficients <- coef(lm_model)
octane_pred_manual <- as.matrix(nir_fd_coefs_df) %*% fitted_coefficients
results <- data.frame(Actual = octane, Predicted = octane_pred_manual)

# Actual vs predicted scatter with the y = x reference line.
ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_point(color = "blue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Actual vs Predicted Octane Ratings",
    x = "Actual Octane Rating",
    y = "Predicted Octane Rating"
  ) +
  theme_minimal()

print(fitted_coefficients)
## X1 X2 X3 X4 X5 X6 X7
## -1907.9197 1149.6367 -656.1988 948.9994 570.8192 -1459.0755 -190.3914
## X8 X9 X10 X11 X12 X13 X14
## -595.2310 1507.6761 426.0391 1389.5247 -3365.8842 -132.2741 -351.4226
## X15 X16 X17 X18 X19 X20
## 4000.9962 -2283.1757 744.1567 -2820.2456 2217.7276 889.6180
# Reshape the wavelength x sample spectra matrix to long format — one row per
# (wavelength, sample) pair — so all 60 curves can be drawn in a single layer.
nir_spectra_df <- as.data.frame(nir_spectra)
nir_spectra_df$Wavelength <- seq(900, 1700, by = 2)
nir_spectra_long <- tidyr::pivot_longer(
  nir_spectra_df,
  cols = -Wavelength,
  names_to = "Sample",
  values_to = "log1R"
)

# One semi-transparent line per sample; the legend (60 entries) is suppressed.
ggplot(nir_spectra_long,
       aes(x = Wavelength, y = log1R, group = Sample, color = Sample)) +
  geom_line(alpha = 0.5) +
  labs(
    title = "NIR Spectra",
    x = "Wavelength (nm)",
    y = "log(1/R)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
